In [1]:
# Imports & settings
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Show every column and widen the text repr so wide frames stay readable.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 160)

# Pre-cleaned monthly recreation-visits table (path relative to the notebook dir).
DATA_PATH = "../data/cleaned/all_parks_recreation_visits.csv"
In [2]:
# Load the cleaned visits table and take a quick look at its size and first rows.
df = pd.read_csv(DATA_PATH)
print(f"Shape: {df.shape}")
df.head()
Shape: (88869, 8)
Out[2]:
Park Unit Code Park Type Region State Year Month Recreation Visits
0 Adams NHP ADAM National Historical Park Northeast MA 1979 1 0.0
1 Adams NHP ADAM National Historical Park Northeast MA 1979 2 0.0
2 Adams NHP ADAM National Historical Park Northeast MA 1979 3 0.0
3 Adams NHP ADAM National Historical Park Northeast MA 1979 4 965.0
4 Adams NHP ADAM National Historical Park Northeast MA 1979 5 2428.0
In [3]:
# Coerce the key columns to numeric; unparseable entries become NaN.
for col in ["Year", "Month", "Recreation Visits"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")
In [4]:
# Drop rows missing any key field, then cast those fields to plain ints.
key_cols = ["Year", "Month", "Recreation Visits"]
df = df.dropna(subset=key_cols)
df = df.astype({col: int for col in key_cols})
In [5]:
# Build a first-of-month timestamp for time-series work.
df = df.assign(
    Date=pd.to_datetime({"year": df["Year"], "month": df["Month"], "day": 1})
)
In [6]:
# Remove exact duplicate rows and report how many were dropped.
n_before = len(df)
df = df.drop_duplicates()
n_dropped = n_before - len(df)
print(f"Rows after clean/dedup: {len(df)} (dropped {n_dropped})")

df.describe(include="all")
Rows after clean/dedup: 88413 (dropped 456)
Out[6]:
Park Unit Code Park Type Region State Year Month Recreation Visits Date
count 88413 88413 88413 88413 87705 88413.000000 88413.000000 8.841300e+04 88413
unique 178 178 17 7 43 NaN NaN NaN NaN
top Adams NHP ADAM National Monument Intermountain FL NaN NaN NaN NaN
freq 552 552 20046 24168 5400 NaN NaN NaN NaN
mean NaN NaN NaN NaN NaN 2002.273252 6.500503 7.801331e+04 2002-09-24 09:39:30.886634368
min NaN NaN NaN NaN NaN 1979.000000 1.000000 0.000000e+00 1979-01-01 00:00:00
25% NaN NaN NaN NaN NaN 1991.000000 4.000000 3.135000e+03 1991-06-01 00:00:00
50% NaN NaN NaN NaN NaN 2003.000000 7.000000 1.218200e+04 2003-02-01 00:00:00
75% NaN NaN NaN NaN NaN 2014.000000 10.000000 5.694400e+04 2014-04-01 00:00:00
max NaN NaN NaN NaN NaN 2024.000000 12.000000 3.912215e+06 2024-12-01 00:00:00
std NaN NaN NaN NaN NaN 13.202178 3.451972 2.103367e+05 NaN
In [7]:
# Sanity checks (missing, ranges)
print("Year range:", df["Year"].min(), "→", df["Year"].max())
print("Months unique:", sorted(df["Month"].unique()))
print("Parks:", df["Park"].nunique(), "| States:", df["State"].nunique(), "| Regions:", df["Region"].nunique())

# Missing check: the dropna above only covered Year/Month/Recreation Visits,
# so other columns (notably State) may still contain NaN.
df.isna().sum()
Year range: 1979 → 2024
Months unique: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12)]
Parks: 178 | States: 43 | Regions: 7
Out[7]:
Park                   0
Unit Code              0
Park Type              0
Region                 0
State                708
Year                   0
Month                  0
Recreation Visits      0
Date                   0
dtype: int64
In [8]:
# Annual totals across every park, as an interactive line chart.
yearly = (
    df.groupby("Year", as_index=False)["Recreation Visits"]
      .sum()
)

fig = px.line(
    yearly,
    x="Year",
    y="Recreation Visits",
    markers=True,
    title="Total Recreation Visits per Year (All Parks)",
)
fig.update_layout(xaxis_title="Year", yaxis_title="Visits")
fig.show()
In [9]:
# Month-of-year totals pooled over all years — the overall seasonality curve.
monthly = (
    df.groupby("Month", as_index=False)["Recreation Visits"]
      .sum()
      .sort_values("Month")
)

fig = px.line(
    monthly,
    x="Month",
    y="Recreation Visits",
    markers=True,
    title="Seasonality – Total Visits by Month (All Parks)",
)
fig.update_xaxes(dtick=1)
fig.show()
In [10]:
# All-time top 15 parks by cumulative visits.
totals_by_park = df.groupby("Park", as_index=False)["Recreation Visits"].sum()
totals_by_park = totals_by_park.sort_values("Recreation Visits", ascending=False)
top_parks = totals_by_park.head(15)

fig = px.bar(
    top_parks,
    x="Recreation Visits",
    y="Park",
    orientation="h",
    title="Top 15 Parks by Total Visits (All Years)",
)
fig.update_layout(yaxis={"categoryorder": "total ascending"})
fig.show()
In [11]:
# YoY growth per park, latest month per park ---

# 1) Monthly panel per park, with a proper timestamp column.
park_month = df.groupby(["Park", "Year", "Month"], as_index=False)["Recreation Visits"].sum()
park_month["Date"] = pd.to_datetime(
    {"year": park_month["Year"], "month": park_month["Month"], "day": 1}
)
In [12]:
# Previous-year same-month baseline.
# NOTE: a positional shift(12) silently misaligns the baseline whenever a
# park's monthly series has any gap (one missing month shifts every later
# row), so instead we match each row to the observation exactly one year
# earlier via a date-keyed self-merge; parks/months without a prior-year
# observation get NaN, as before.
park_month = park_month.sort_values(["Park", "Date"])
prev_year = park_month[["Park", "Date", "Recreation Visits"]].rename(
    columns={"Recreation Visits": "Prev_Year_Visits"}
)
# Shift the baseline forward one year so it lines up with the current row's Date.
prev_year["Date"] = prev_year["Date"] + pd.DateOffset(years=1)
park_month = park_month.merge(prev_year, on=["Park", "Date"], how="left")
In [13]:
# Year-over-year growth versus the same month one year earlier.
visit_delta = park_month["Recreation Visits"] - park_month["Prev_Year_Visits"]
park_month["YoY_Growth"] = visit_delta / park_month["Prev_Year_Visits"]
In [14]:
# Most recent observation per park (index of max Date; avoids groupby-apply).
idx_latest = park_month.groupby("Park")["Date"].idxmax()
latest = park_month.loc[idx_latest].copy()
In [15]:
# Keep only parks with a usable YoY baseline (drop NaN and division-by-zero inf).
latest = latest.replace([np.inf, -np.inf], np.nan)
latest = latest.dropna(subset=["YoY_Growth"])

# 6) Top gainers / decliners by latest YoY growth.
top_gainers = latest.sort_values("YoY_Growth", ascending=False).head(10)
top_decliners = latest.sort_values("YoY_Growth").head(10)

yoy_cols = ["Park", "Year", "Month", "Recreation Visits", "Prev_Year_Visits", "YoY_Growth"]

print("Top YoY Gainers:")
display(top_gainers[yoy_cols])

print("\nTop YoY Decliners:")
display(top_decliners[yoy_cols])
Top YoY Gainers:
Park Year Month Recreation Visits Prev_Year_Visits YoY_Growth
5387 Arlington House The R.E. Lee MEM 2024 12 18689 6064.0 2.081959
76496 Great Basin NP 2024 12 3693 1260.0 1.930952
14723 Boston Harbor Islands NRA 2024 12 2500 950.0 1.631579
24763 Casa Grande Ruins NM 2024 12 9499 3963.0 1.396922
86204 Hopewell Furnace NHS 2024 12 2031 870.0 1.334483
13679 Bluestone NSR 2024 12 639 300.0 1.130000
27667 Chaco Culture NHP 2024 12 1984 937.0 1.117396
27115 Cesar E. Chavez NM 2024 12 1658 846.0 0.959811
7595 Bandelier NM 2024 12 17514 9140.0 0.916193
66854 Gauley River NRA 2024 12 5484 2929.0 0.872311
Top YoY Decliners:
Park Year Month Recreation Visits Prev_Year_Visits YoY_Growth
44502 Edgar Allan Poe NHS 2024 12 0 1177.0 -1.000000
40038 De Soto NMEM 2024 12 0 18207.0 -1.000000
67406 General Grant NMEM 2024 12 1156 8184.0 -0.858749
23563 Carl Sandburg Home NHS 2024 12 1598 9455.0 -0.830989
8255 Bent's Old Fort NHS 2024 12 156 638.0 -0.755486
1667 Andrew Johnson NHS 2024 12 2190 8597.0 -0.745260
68510 George Washington Birthplace NM 2024 12 765 2220.0 -0.655405
81824 Hampton NHS 2024 12 748 1756.0 -0.574032
36378 Crater Lake NP 2024 12 5005 11432.0 -0.562194
87860 Hovenweep NM 2024 12 318 700.0 -0.545714
In [16]:
# Horizontal bar chart of the top gainers, x-axis formatted as percentages.
# (plotly.express is already imported as px in the first cell — the redundant
# mid-notebook re-import has been removed.)
fig = px.bar(
    top_gainers.sort_values("YoY_Growth"),
    x="YoY_Growth",
    y="Park",
    orientation="h",
    title="Top 10 Parks – Latest YoY Growth",
)
fig.update_layout(xaxis_tickformat=".0%")
fig.show()
In [17]:
# State & Region summaries (for map and dashboard)
# State-level monthly panel.
state_month = df.groupby(["State", "Year", "Month"], as_index=False)["Recreation Visits"].sum()
state_month["Date"] = pd.to_datetime(
    {"year": state_month["Year"], "month": state_month["Month"], "day": 1}
)
In [18]:
# Region-level monthly panel.
region_month = df.groupby(["Region", "Year", "Month"], as_index=False)["Recreation Visits"].sum()
region_month["Date"] = pd.to_datetime(
    {"year": region_month["Year"], "month": region_month["Month"], "day": 1}
)
In [19]:
# Persist the state/region panels (map & filters), then preview both.
for frame, out_path in [
    (state_month, "../data/cleaned/state_month_visits.csv"),
    (region_month, "../data/cleaned/region_month_visits.csv"),
]:
    frame.to_csv(out_path, index=False)

state_month.head(), region_month.head()
Out[19]:
(  State  Year  Month  Recreation Visits       Date
 0    AK  1979      1                502 1979-01-01
 1    AK  1979      2                193 1979-02-01
 2    AK  1979      3                720 1979-03-01
 3    AK  1979      4               1127 1979-04-01
 4    AK  1979      5              17425 1979-05-01,
    Region  Year  Month  Recreation Visits       Date
 0  Alaska  1979      1                502 1979-01-01
 1  Alaska  1979      2                193 1979-02-01
 2  Alaska  1979      3                720 1979-03-01
 3  Alaska  1979      4               1127 1979-04-01
 4  Alaska  1979      5              17425 1979-05-01)
In [20]:
# Seasonality index (per park): month-of-year totals plus each park's overall total.
park_month_total = df.groupby(["Park", "Month"], as_index=False)["Recreation Visits"].sum()

park_total_year = df.groupby("Park", as_index=False)["Recreation Visits"].sum()
park_total_year = park_total_year.rename(columns={"Recreation Visits": "Total_Visits"})
In [21]:
# Average per month = park total / number of distinct calendar months observed.
months_per_park = (
    df.groupby("Park")["Month"]
      .nunique()
      .rename("Months_Count")
      .reset_index()
)
park_total_year = park_total_year.merge(months_per_park, on="Park", how="left")

park_month_avg = park_month_total.merge(park_total_year, on="Park", how="left")
park_month_avg["Avg_Per_Month"] = (
    park_month_avg["Total_Visits"] / park_month_avg["Months_Count"]
)
In [22]:
# Seasonality index = month total / average month (1.0 means an average month).
park_month_avg["Seasonality_Index"] = (
    park_month_avg["Recreation Visits"] / park_month_avg["Avg_Per_Month"]
)

# Save for the dashboard, then preview.
park_month_avg.to_csv("../data/cleaned/park_seasonality_index.csv", index=False)
park_month_avg.head()
Out[22]:
Park Month Recreation Visits Total_Visits Months_Count Avg_Per_Month Seasonality_Index
0 Adams NHP 1 37206 4931861 12 410988.416667 0.090528
1 Adams NHP 2 63858 4931861 12 410988.416667 0.155377
2 Adams NHP 3 66681 4931861 12 410988.416667 0.162245
3 Adams NHP 4 245440 4931861 12 410988.416667 0.597194
4 Adams NHP 5 540491 4931861 12 410988.416667 1.315100
In [23]:
# Seasonality profile for one example park (note: despite the section name,
# this renders a bar chart of the monthly index, not a heatmap).
one = park_month_avg[park_month_avg["Park"] == park_month_avg["Park"].iloc[0]]
fig = px.bar(one, x="Month", y="Seasonality_Index", title=f"Seasonality Index – {one['Park'].iloc[0]}")
fig.update_xaxes(dtick=1)
fig.show()
In [24]:
# Focus on the trailing 24 months of data.
cutoff = df["Date"].max() - pd.DateOffset(months=24)
recent = df.loc[df["Date"] >= cutoff].copy()

recent_yearly = recent.groupby("Year", as_index=False)["Recreation Visits"].sum()
fig = px.bar(recent_yearly, x="Year", y="Recreation Visits", title="Visits – Last 24 Months")
fig.show()

# State-by-date totals within the recent window.
recent_monthly_state = recent.groupby(["State", "Date"], as_index=False)["Recreation Visits"].sum()
recent_monthly_state.sort_values("Date").head()
Out[24]:
State Date Recreation Visits
0 AK 2022-12-01 3990
825 TN 2022-12-01 735091
50 AR 2022-12-01 239688
800 SD 2022-12-01 6266
775 SC 2022-12-01 47892
In [25]:
# Persist the EDA artifacts consumed by the dashboard.
eda_artifacts = {
    "../data/cleaned/_eda_base_all_rows.csv": df,
    "../data/cleaned/_eda_yearly_totals.csv": yearly,
    "../data/cleaned/_eda_monthly_totals.csv": monthly,
    "../data/cleaned/_eda_top_parks_total.csv": top_parks,
    "../data/cleaned/_eda_park_month_panel.csv": park_month,
    "../data/cleaned/_eda_latest_yoy_per_park.csv": latest,
}
for out_path, frame in eda_artifacts.items():
    frame.to_csv(out_path, index=False)
In [26]:
# Quick data-quality flags.
# Months outside 1–12 would indicate a parsing problem.
print("Bad months:", df.loc[~df["Month"].between(1, 12)].shape[0])

# Flag the largest 0.1% of monthly values as potential spikes for review.
cap = df["Recreation Visits"].quantile(0.999)
spikes = (
    df.loc[df["Recreation Visits"] >= cap]
      .sort_values("Recreation Visits", ascending=False)
)
print(f"Extreme spikes (>= {int(cap)}):", len(spikes))
spikes.head(10)
Bad months: 0
Extreme spikes (>= 2205084): 89
Out[26]:
Park Unit Code Park Type Region State Year Month Recreation Visits Date
66381 Gateway NRA GATE National Recreation Area Northeast NY 1979 7 3912215 1979-07-01
66405 Gateway NRA GATE National Recreation Area Northeast NY 1981 7 3493360 1981-07-01
66417 Gateway NRA GATE National Recreation Area Northeast NY 1982 7 3394057 1982-07-01
12849 Blue Ridge PKWY BLRI National Parkway Southeast NC 1987 10 3181024 1987-10-01
66429 Gateway NRA GATE National Recreation Area Northeast NY 1983 7 3150909 1983-07-01
66393 Gateway NRA GATE National Recreation Area Northeast NY 1980 7 3071490 1980-07-01
13026 Blue Ridge PKWY BLRI National Parkway Southeast NC 2002 7 2899584 2002-07-01
66394 Gateway NRA GATE National Recreation Area Northeast NY 1980 8 2883821 1980-08-01
12861 Blue Ridge PKWY BLRI National Parkway Southeast NC 1988 10 2854160 1988-10-01
12824 Blue Ridge PKWY BLRI National Parkway Southeast NC 1985 9 2827523 1985-09-01
In [27]:
# Hotspot vs Off-Season Labeling by State
# Hotspot vs Off-Season labeling ---

selected_year = 2024
selected_month = 7

# Restrict to the chosen year/month.
period_mask = (df["Year"] == selected_year) & (df["Month"] == selected_month)
month_df = df.loc[period_mask]

# Mean visits per park within each state for that month.
state_avg = month_df.groupby("State", as_index=False)["Recreation Visits"].mean()

# Upper/lower quartiles of the state averages define the label cutoffs.
hot_threshold = state_avg["Recreation Visits"].quantile(0.75)
cold_threshold = state_avg["Recreation Visits"].quantile(0.25)
In [28]:
# Assign each state a demand label based on its quartile position.
def label_season(avg_visits: float) -> str:
    """Classify a state's average visits as Hotspot / Off-Season / Moderate.

    Same precedence as before: the hot check wins first, then the cold check.
    """
    if avg_visits >= hot_threshold:
        return "Hotspot"
    if avg_visits <= cold_threshold:
        return "Off-Season"
    return "Moderate"

state_avg["Season_Label"] = state_avg["Recreation Visits"].apply(label_season)

# Visualize on a US choropleth.
# (plotly.express is already imported as px at the top of the notebook — the
# redundant mid-notebook re-import has been removed.)
fig = px.choropleth(
    state_avg,
    locations="State",
    locationmode="USA-states",
    color="Season_Label",
    title=f"State Demand Classification ({selected_year}-{selected_month:02d})",
    scope="usa",
    color_discrete_map={"Hotspot": "red", "Moderate": "orange", "Off-Season": "blue"},
)
fig.show()
In [29]:
# Monthly time series per park, with a first-of-month timestamp.
park_ts = df.groupby(["Park", "Year", "Month"], as_index=False)["Recreation Visits"].sum()
park_ts["Date"] = pd.to_datetime(
    {"year": park_ts["Year"], "month": park_ts["Month"], "day": 1}
)
In [30]:
# Trailing 12-month visit totals per park (partial windows allowed after 6 months).
park_ts = park_ts.sort_values(["Park", "Date"])

def rolling_12m(series):
    """12-month trailing sum with at least 6 observations required."""
    return series.rolling(12, min_periods=6).sum()

park_ts["Rolling_12M"] = park_ts.groupby("Park")["Recreation Visits"].transform(rolling_12m)

# 3) Month-to-month growth of the rolling total (no gap filling).
park_ts["Rolling_Growth"] = park_ts.groupby("Park")["Rolling_12M"].pct_change(fill_method=None)
In [31]:
# Latest available rolling-growth reading per park.
idx_latest_ts = park_ts.groupby("Park")["Date"].idxmax()
latest_growth = park_ts.loc[idx_latest_ts].dropna(subset=["Rolling_Growth"])

# Rank the fastest-growing parks.
top_trending = latest_growth.sort_values("Rolling_Growth", ascending=False).head(10)

print("Top 10 Trending Parks (12-Month Rolling Growth):")
display(top_trending[["Park", "Rolling_12M", "Rolling_Growth"]])
Top 10 Trending Parks (12-Month Rolling Growth):
Park Rolling_12M Rolling_Growth
563 Amache NHS 4771.0 0.088772
14723 Boston Harbor Islands NRA 28157.0 0.058255
24763 Casa Grande Ruins NM 104429.0 0.055980
9563 Big Cypress NPRES 2216708.0 0.053804
13679 Bluestone NSR 7327.0 0.048512
19183 Canyon de Chelly NM 387433.0 0.047855
28890 Charles Young Buffalo Soldiers NM 4716.0 0.044287
18631 Cane River Creole NHP 12592.0 0.043421
7595 Bandelier NM 213595.0 0.040805
27115 Cesar E. Chavez NM 26641.0 0.031438
In [32]:
# Horizontal bar chart of the trending parks, x-axis formatted as percentages.
fig = px.bar(
    top_trending.sort_values("Rolling_Growth"),
    x="Rolling_Growth",
    y="Park",
    orientation="h",
    title="Top 10 Trending Parks (Rolling 12M Growth)",
)
fig.update_layout(xaxis_tickformat=".0%")
fig.show()